{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Generating figures from methods document"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import sqlite3\n",
    "import random\n",
    "import itertools\n",
    "import subprocess\n",
    "import os\n",
    "import shlex\n",
    "import re"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# path on rte\n",
    "db_path = os.path.expanduser(\"~/data/db/arxiv_db_images.sqlite3\")\n",
    "\n",
    "# Here we import the sqlite3 database and create a cursor\n",
    "db = sqlite3.connect(db_path)\n",
    "c = db.cursor()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# go through all of the retrieved SQL rows and format as an accreditation\n",
    "# if html is True, format with <a href=####> for web usage\n",
    "# input = [[author, title, date, identifier], [author, title, date, identifier],...[author, title, date, identifier]]\n",
    "\n",
    "def format_credits(rows, html=False):\n",
    "    accreditations = []\n",
    "\n",
    "    empty_counter = 0\n",
    "    \n",
    "    # html = True\n",
    "#     html = False\n",
    "\n",
    "    # grab the details and check each\n",
    "    for i, row in enumerate(rows):\n",
    "        print(i)\n",
    "        if row:\n",
    "            s = row[0]\n",
    "            start = \"['\"\n",
    "            end = \"']\"\n",
    "            # if the author has been written to the SQLite database with parenthesis\n",
    "            if s.find(start) != -1 and s.rfind(end) != -1:\n",
    "                author = s[s.find(start)+len(start):s.rfind(end)-len(end)]\n",
    "            else:\n",
    "                author = s[:]\n",
    "            print(author)\n",
    "\n",
    "            title = row[1]\n",
    "            # replace line breaks and double spaces\n",
    "            title = title.replace(\"\\n\", \"\").replace(\"  \",\" \")\n",
    "            print(title)\n",
    "\n",
    "            date = row[2].split(\"-\")[0]\n",
    "            print(date)\n",
    "\n",
    "            identifier = row[3]\n",
    "            print(identifier)\n",
    "        #     reg_exp = re.compile(\"/[^\\d]\\d{2}[^\\d]/\")\n",
    "            longest_digits = max(re.findall(r'\\d+', identifier), key = len)\n",
    "            print(len(longest_digits))\n",
    "\n",
    "            # if the identifier contains seven consecutive numbers, add a slash\n",
    "            if len(longest_digits) == 7:\n",
    "                print(\"----- regex match -----\")\n",
    "                reverse = identifier[::-1]\n",
    "                print(reverse)\n",
    "                identifier_reverse = reverse[:7] + \"/\" + reverse[7:]\n",
    "                identifier = identifier_reverse[::-1]\n",
    "            else:\n",
    "                # otherwise we can leave the identifier how it is\n",
    "                print(\"----- no match -----\")\n",
    "            print(identifier)\n",
    "            url = \"https://arxiv.org/abs/\" + identifier\n",
    "            print(url)\n",
    "            print(\"*\" * 20)\n",
    "        #     print(row[0])\n",
    "\n",
    "            # format string and append\n",
    "            if html:\n",
    "                fmt_str = '{}: {}, {}, <a href=\"{}\">{}</a>'\n",
    "                accreditations.append(fmt_str.format(author, title, str(date), url, url))  \n",
    "            else:\n",
    "                fmt_str = '{}: {}, {}, {}'\n",
    "                accreditations.append(fmt_str.format(author, title, str(date), url, url))\n",
    "        else:\n",
    "            empty_counter += 1\n",
    "            print(\"empty!\")\n",
    "\n",
    "    print(\"number of empty slots:\", empty_counter)\n",
    "    return accreditations"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Get random images\n",
    "\n",
    "Limit selection to October 2012 where possible, otherwise all of 2012. Grab all the images from that category and date range and then shuffle and extract. This uses a random seed for reproducible results.\n",
    "\n",
    "Run one of the following SQL queries and then skip down to the montage generation code."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "num_images = 144\n",
    "# num_images = 16"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Get random images\n",
    "\n",
    "name = f'random_montage_{num_images}_oct2012'\n",
    "\n",
    "sql = (\"SELECT metadata.cat, images.path, images.filename, images.identifier, metadata.created, images.id, metadata.id \"\n",
    "    \"FROM images \"\n",
    "    \"LEFT JOIN metadata ON images.identifier = metadata.identifier \"\n",
    "    \"WHERE images.x != '' \"\n",
    "    \"AND metadata.created BETWEEN date('2012-10-01') \"\n",
    "    \"AND date('2012-10-31') \")\n",
    "\n",
    "c.execute(sql, )\n",
    "rows = c.fetchall()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Get images from cs.CV from October 2012\n",
    "\n",
    "name = \"csCV_montage_144_oct2012\"\n",
    "\n",
    "sql = (\"SELECT metadata.cat, images.path, images.filename, images.identifier, metadata.created, images.id, metadata.id \"\n",
    "    \"FROM images \"\n",
    "    \"LEFT JOIN metadata ON images.identifier = metadata.identifier \"\n",
    "    \"WHERE images.x != '' \"\n",
    "    \"AND metadata.created BETWEEN date('2012-10-01') \"\n",
    "    \"AND date('2012-10-31') \"\n",
    "    \"AND substr(trim(metadata.cat),1,instr(trim(metadata.cat)||' ',' ')-1) = ? \")\n",
    "\n",
    "c.execute(sql, (\"cs.CV\", ))\n",
    "rows = c.fetchall()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Get images from stat.ML\n",
    "\n",
    "name = \"statML_montage_144_oct2012\"\n",
    "\n",
    "sql = (\"SELECT metadata.cat, images.path, images.filename, images.identifier, metadata.created, images.id, metadata.id \"\n",
    "    \"FROM images \"\n",
    "    \"LEFT JOIN metadata ON images.identifier = metadata.identifier \"\n",
    "    \"WHERE images.x != '' \"\n",
    "    \"AND metadata.created BETWEEN date('2012-10-01') \"\n",
    "    \"AND date('2012-10-31') \"\n",
    "    \"AND substr(trim(metadata.cat),1,instr(trim(metadata.cat)||' ',' ')-1) = ? \")\n",
    "\n",
    "c.execute(sql, (\"stat.ML\", ))\n",
    "rows = c.fetchall()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Get all images from cs.AI from 2012 (whole year)\n",
    "\n",
    "name = \"csAI_montage_144_all2012\"\n",
    "\n",
    "sql = (\"SELECT metadata.cat, images.path, images.filename, images.identifier, metadata.created, images.id, metadata.id \"\n",
    "    \"FROM images \"\n",
    "    \"LEFT JOIN metadata ON images.identifier = metadata.identifier \"\n",
    "    \"WHERE images.x != '' \"\n",
    "    \"AND metadata.created BETWEEN date('2012-01-01') \"\n",
    "    \"AND date('2012-12-31') \"\n",
    "    \"AND substr(trim(metadata.cat),1,instr(trim(metadata.cat)||' ',' ')-1) = ?\")\n",
    "\n",
    "c.execute(sql, (\"cs.AI\", ))\n",
    "rows = c.fetchall()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(f'Total number of images in category/date range: {len(rows)}')\n",
    "for row in rows[:3]:\n",
    "    print(row)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Shuffle and slice entries"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# shuffle first, then get ids and paths\n",
    "# keep the seed as \"4\" so that results are reproducible\n",
    "random.seed(4)\n",
    "random.shuffle(rows)\n",
    "\n",
    "meta_ids = []\n",
    "ids = []\n",
    "filepaths = []\n",
    "\n",
    "for row in rows[:num_images]:\n",
    "#     print(row[5])\n",
    "    ids.append(row[5])\n",
    "    meta_ids.append(row[6])\n",
    "    path = row[1] + '/' + row[2]\n",
    "#     print(path)\n",
    "    filepaths.append(path)\n",
    "print(f'Number of image IDs selected: {len(ids)}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# sample of IDs\n",
    "print(meta_ids[:3])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Export montage\n",
    "\n",
    "Saves an image using the imagemagick `montage` command."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# get a montage of some images\n",
    "# change output path here\n",
    "outputname = [os.path.expanduser(\"~/documentation/data-samples/\") + name + \".jpg\"]\n",
    "print(outputname)\n",
    "\n",
    "# move to the source directory, fix for montage to work properly\n",
    "os.chdir(os.path.expanduser(\"~/arXiv/\"))\n",
    "print(os.getcwd())\n",
    "\n",
    "prearg = shlex.split(\"-colorspace CMYK\")\n",
    "\n",
    "assert num_images == 144 or num_images == 16, \"num_images must be either 16 or 144 for montage grid!\"\n",
    "if num_images == 144:\n",
    "    # for bigger montage of 12x12\n",
    "    arguments = shlex.split(\"-colorspace sRGB -background white -alpha background -geometry 240x240+2+2 -tile 12x\")\n",
    "elif num_images == 16:\n",
    "    # for smaller montage of 4x4\n",
    "    arguments = shlex.split(\"-colorspace sRGB -background white -alpha background -geometry 480x480+2+2 -tile 4x\")\n",
    "    \n",
    "# print(arguments)\n",
    "\n",
    "filelist = []\n",
    "# this takes a slice of the larger shuffled list\n",
    "for filepath in itertools.islice(filepaths, 0, 144):\n",
    "#     print(filepath)\n",
    "    # put the filepath into the list but add the directory, remove the dot\n",
    "    # and also add [0] to only use the first page of multi-page image documents\n",
    "    filelist.append(\"src_all\" + filepath.replace('./','/') + '[0]')\n",
    "           \n",
    "# print(filelist)\n",
    "\n",
    "# call the montage command and parse list of files and arguments\n",
    "montage_cmd = [\"montage\"] + prearg + filelist + arguments + outputname\n",
    "\n",
    "result = subprocess.Popen(montage_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)\n",
    "out, err = result.communicate()\n",
    "print(out)\n",
    "print(err)\n",
    "print(\"subprocess finished\")\n",
    "print(\"-\" * 40)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Write credits for images\n",
    "\n",
    "Search for a specific set of entries and retrieve metadata"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# this is for the image identifier\n",
    "# author, title, date, identifier\n",
    "\n",
    "rows = []\n",
    "\n",
    "sql = (\"SELECT metadata.authors, metadata.title, metadata.created, metadata.identifier \"\n",
    "    \"FROM metadata \"\n",
    "    \"WHERE metadata.id is ? \")\n",
    "\n",
    "# iterate over the file_ids list and grab sql data\n",
    "for file_id in meta_ids[:]:\n",
    "    print(\"image parent article id:\",file_id)\n",
    "    c.execute(sql, (file_id, ))\n",
    "    row = c.fetchall()\n",
    "    rows.append(row[0])\n",
    "    print(row[0])\n",
    "\n",
    "accreditations = format_credits(rows, False)\n",
    "\n",
    "os.chdir(os.path.expanduser(\"~/documentation/data-samples/\"))\n",
    "print(os.getcwd())\n",
    "\n",
    "f = open(name + \".txt\", \"w+\")\n",
    "for row in accreditations:\n",
    "    f.write(str(row) + \"\\n\")\n",
    "f.close()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
